Data Import

Load raw data (from https://www.kaggle.com/ludobenistant/hr-analytics), add factors, save.

HR <- read_csv("HR_comma_sep.csv", col_types=cols(
  satisfaction_level = col_double(),
  last_evaluation = col_double(),
  number_project = col_integer(),
  average_montly_hours = col_integer(),
  time_spend_company = col_integer(),
  Work_accident = col_integer(),
  left = col_integer(),
  promotion_last_5years = col_integer(),
  sales = col_character(),
  salary = col_character()
))
HR <- plyr::rename(HR, replace=c("satisfaction_level"="Satis", "last_evaluation"="Eval",
            "number_project"="NumProj", "average_montly_hours"="MonHrs",
            "time_spend_company"="Tenure", "Work_accident"="Accdt01", 
            "left"="Left01", "promotion_last_5years"="Promo5yr01",
            "sales"="Dept", "salary"="Salary"))

HR$Salary <- factor(HR$Salary, levels=c("low", "medium", "high"), ordered=TRUE)
HR$Dept <- factor(HR$Dept)

HR$Accdt <- factor(HR$Accdt01 == 1)
HR$Promo5yr <- factor(HR$Promo5yr01 == 1)
HR$Left <- factor(HR$Left01==1)

set.seed(42)
HR$location <- factor(sapply(HR$Dept, location))
write_csv(HR, "HR.csv")

Data Summary

Dimensional Summary

summary(HR)
##      Satis             Eval           NumProj          MonHrs     
##  Min.   :0.0900   Min.   :0.3600   Min.   :2.000   Min.   : 96.0  
##  1st Qu.:0.4400   1st Qu.:0.5600   1st Qu.:3.000   1st Qu.:156.0  
##  Median :0.6400   Median :0.7200   Median :4.000   Median :200.0  
##  Mean   :0.6128   Mean   :0.7161   Mean   :3.803   Mean   :201.1  
##  3rd Qu.:0.8200   3rd Qu.:0.8700   3rd Qu.:5.000   3rd Qu.:245.0  
##  Max.   :1.0000   Max.   :1.0000   Max.   :7.000   Max.   :310.0  
##                                                                   
##      Tenure          Accdt01           Left01         Promo5yr01     
##  Min.   : 2.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.: 3.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median : 3.000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   : 3.498   Mean   :0.1446   Mean   :0.2381   Mean   :0.02127  
##  3rd Qu.: 4.000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :10.000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##                                                                      
##           Dept         Salary       Accdt        Promo5yr    
##  sales      :4140   low   :7316   FALSE:12830   FALSE:14680  
##  technical  :2720   medium:6446   TRUE : 2169   TRUE :  319  
##  support    :2229   high  :1237                              
##  IT         :1227                                            
##  product_mng: 902                                            
##  marketing  : 858                                            
##  (Other)    :2923                                            
##     Left                location   
##  FALSE:11428   Brisbane     :1801  
##  TRUE : 3571   London       :8925  
##                San Francisco:4273  
##                                    
##                                    
##                                    
## 

Pairwise Scatter Plots and Distributions for Staying vs Leaving Staff

plot.new()
png(filename="hr-scatter.png", res=300, width = 3000, height = 3000)
my_colors <- brewer.pal(3, "Set2")
samp.sz <- 3000
pct <- round(100*samp.sz/nrow(HR))
data <- HR[sample(nrow(HR), samp.sz),]
scatterplotMatrix(~Satis+Eval+NumProj+MonHrs+Tenure+Accdt01+Promo5yr01+Dept+Salary|Left01, data=data, reg.line="", smoother="", col=my_colors , smoother.args=list(col="grey") , pch=c(3,4), legend.plot=FALSE, main=paste("Scatter Plot Pairs for HR Turnover (",pct,"% of data)", sep=""))
par(xpd=TRUE, cex=0.7)
legend(x=0.92, y=1.16, c("Stay", "Leave"), col=my_colors, pch=c(3,4), horiz=TRUE)
dev.off()

Pairwise Numeric Correlations within Full Dataset

Leaving is negatively correlated with:

  • satisfaction level (-0.39)
  • having a work accident (-0.15)
  • and less so with a promotion in the last 5 years (-0.06)

Leaving is positively correlated with:

  • tenure (0.14)
  • and less so with average hours worked per month (0.07)

In addition:

  • satisfaction level is correlated with last evaluation (0.11), and negatively with number of projects (-0.14) and tenure (-0.10)
  • last evaluation is positively correlated with number of projects (0.35), average monthly hours (0.34) and tenure (0.13)
M <- cor(HR[1:8])
corrplot(M, method = 'pie', order ="hclust",
         tl.col="black", tl.cex = 1, tl.offset = 0.1, tl.srt = 45, addrect=2)

Pairwise Numeric Correlations for Leaving Employees

M <- cor(HR[HR$Left==TRUE,c(1:6,8)])
corrplot(M, method = 'pie', order ="hclust",
         tl.col="black", tl.cex = 1, tl.offset = 0.1, tl.srt = 45, addrect=2)

Pairwise Numeric Correlations for Staying Employees

M <- cor(HR[HR$Left==FALSE,c(1:6,8)])
corrplot(M, method = 'pie', order ="hclust",
         tl.col="black", tl.cex = 1, tl.offset = 0.1, tl.srt = 45, addrect=2)

Exploratory Data Analysis

Career advancement

Many people leaving have been with the company at least 4 years but have not had a promotion in the last 5 years, despite working long hours.

with(HR, coplot(MonHrs ~ jitter(Tenure) |
                Promo5yr + Left))

These unpromoted, long-term workers who leave are receiving the highest evaluations.

with(HR, coplot(Eval ~ jitter(Tenure) |
                Promo5yr + Left))

These unpromoted, long-term workers who leave are also reporting high levels of satisfaction with the company.

with(HR, coplot(Satis ~ jitter(Tenure) |
                Promo5yr + Left))

Of the unpromoted workers putting in long hours who are leaving, some report the lowest levels of satisfaction, but more report very high levels of satisfaction.

The ones who are working the fewest hours report fairly low satisfaction.

with(HR, coplot(MonHrs ~ Satis |
                Promo5yr + Left))

The staff on the largest number of projects and who put in the longest hours, with no promotion in 5 years, tend to leave.

with(HR, coplot(MonHrs ~ jitter(NumProj) |
                Promo5yr + Left))

Of the unpromoted workers who left, many had both high evaluations and high levels of satisfaction.

with(HR, coplot(Eval ~ Satis |
                Promo5yr + Left))

Plot: evaluation, satisfaction | hours + Left evaluation, satisfaction | time with co + Left evaluation, satisfaction | NumProj + Left